In [1]:
import pandas as pd
from pandas import DataFrame
import os
import re
import numpy as np
import cPickle as pickle
from collections import Counter
from sklearn.manifold import TSNE
import nltk

## One-time setup methods (run once per session)


In [18]:
def get_all_topics():
    """Return [all_topics, genre] for the tweet-topic classification task.

    Returns:
        list: [all_topics, genre] where
            all_topics -- 1-D np.array of every known topic string, in the
                fixed order tech, politics, music, sports (getTopicId relies
                on this ordering to derive integer ids);
            genre -- 1-D np.array of the four genre labels.
    """
    genre = np.array(['tech', 'politics', 'music', 'sports'])
    tech = np.array(['@microsoft', 'nokia', 'amazon', 'amazon prime', 'amazon prime day', 'apple', 'apple watch', 'ipad', 'iphone', 'ipod', 'oracle', 'ibm', 'nintendo', 'moto g', 'google', 'google +', 'ps4', 'netflix'])
    # BUG FIX: 'donald trump' had a stray leading space (' donald trump'),
    # so isin()/index() lookups on that topic could never match.
    politics = np.array(['angela merkel',  'bernie sanders', 'david cameron', 'donald trump', 'hillary', 'joe biden', 'michelle obama', 'obama', 'rahul gandhi', 'tony blair'])
    music = np.array(['bee gees', 'beyonce', 'bob marley', 'chris brown', 'david bowie', 'katy perry',  'ed sheeran', 'foo fighters', 'janet jackson', 'lady gaga', 'michael jackson',  'ac/dc', 'the vamps', 'iron maiden', 'rolling stone', 'jay-z', 'snoop dogg', 'nirvana'])
    sports = np.array(['arsenal', 'barca', 'federer', 'floyd mayweather', 'hulk hogan', 'john cena', 'kris bryant', 'randy orton', 'real madrid', 'serena', 'messi', 'david beckham', 'rousey', 'super eagles', 'kane', 'red sox', 'white sox'])
    all_topics = np.concatenate((tech, politics, music, sports))
    return [all_topics, genre]

In [14]:
def word2topic_preprocess():
    """Load the pickled word->topic-vector dict and return its vocabulary.

    Returns:
        list: the words (keys) present in the word2topic mapping.
    """
    # "rb": pickle files are binary; the original text-mode "r" only worked on
    # Python 2 and leaked the file handle. SECURITY: pickle.load can execute
    # arbitrary code — only load files you created yourself.
    with open("word2topic", "rb") as fh:
        word2topic = pickle.load(fh)
    # NOTE(review): only the keys escape this function — the `word2topic` dict
    # stays local, yet getEmbeddingWord2Topic reads a *global* `word2topic`.
    # TODO: expose the dict (return it, or pass it explicitly downstream).
    return list(word2topic.keys())

In [ ]:
def getTopicId(topic):
    """Return the integer id of `topic`: its position in the global
    `all_topics` array (raises ValueError if the topic is unknown)."""
    topic_list = all_topics.tolist()
    return topic_list.index(topic)

In [16]:
def word2vec_preprocess(df):
    """Train a Word2Vec model on the tokenized tweets and persist it.

    Args:
        df: DataFrame with a 'tokenized_sents' column (list of tokens per row).

    Returns:
        The trained gensim Word2Vec model, L2-normalized and read-only.
    """
    from gensim.models import Word2Vec
    import logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    # 100-dim vectors, context window 5, drop words seen fewer than 5 times.
    model = Word2Vec(df["tokenized_sents"], size=100, window=5, min_count=5, workers=4)
    # Persist so later sessions can Word2Vec.load("word2vec") without retraining.
    # (The original saved and then immediately reloaded the same file — the
    # reload was a no-op round trip and has been removed. It also imported
    # gensim.models.word2vec a second time under another alias.)
    model.save("word2vec")
    # Precompute normalized vectors; replace=True frees the raw vectors but
    # makes the model untrainable afterwards.
    model.init_sims(replace=True)
    return model

In [ ]:
#add one hot 

def one_hot_encoding():
    one_hot = pd.get_dummies(all_topics)
    return one_hot

In [ ]:

## Helper methods


In [15]:
def getEmbeddingWord2Topic(sentence, vocab=None, vectors=None):
    """Embed a tokenized sentence as a flat 30x100 word2topic vector.

    Known words' vectors are concatenated in order, truncated to 30 words,
    and zero-padded to a fixed length of 3000.

    Args:
        sentence: iterable of token strings.
        vocab: collection of known words; defaults to the notebook global
            `keys` (backward compatible with the original signature).
        vectors: mapping word -> 100-dim vector; defaults to the notebook
            global `word2topic`.

    Returns:
        np.ndarray of shape (3000,).
    """
    num_of_words = 30
    size_of_vec = 100
    embedding_size = num_of_words * size_of_vec
    if vocab is None:
        vocab = keys
    if vectors is None:
        # NOTE(review): `word2topic` is never defined at module level —
        # word2topic_preprocess() keeps the dict local and returns only the
        # keys, so this fallback raises NameError. TODO: expose the dict.
        vectors = word2topic
    # (The original accumulated into a variable named `list`, shadowing the
    # builtin; build a list of pieces and flatten once instead.)
    pieces = [np.asarray(vectors[word], dtype=float).ravel()
              for word in sentence if word in vocab]
    flat = np.concatenate(pieces) if pieces else np.array([])
    if flat.size > embedding_size:
        flat = flat[:embedding_size]
    # Zero-pad up to the fixed embedding size.
    return np.append(flat, np.zeros(embedding_size - flat.size))

In [17]:
def getEmbedding(sentence, w2v_model=None):
    """Embed a tokenized sentence as a flat 30x100 Word2Vec vector.

    In-vocabulary words' vectors are concatenated in order, truncated to
    30 words, and zero-padded to a fixed length of 3000.

    Args:
        sentence: iterable of token strings.
        w2v_model: trained gensim Word2Vec model; defaults to the notebook
            global `model` (backward compatible — the original signature had
            a stray trailing comma and read the global directly).

    Returns:
        np.ndarray of shape (3000,).
    """
    num_of_words = 30
    size_of_vec = 100
    embedding_size = num_of_words * size_of_vec
    if w2v_model is None:
        w2v_model = model
    flat = np.array([])
    for word in sentence:
        if word in w2v_model.wv.vocab:
            flat = np.append(flat, w2v_model.wv[word])
    if flat.size > embedding_size:
        flat = flat[:embedding_size]
    # Zero-pad up to the fixed embedding size.
    pad = np.zeros(embedding_size - flat.size)
    return np.append(flat, pad)

In [ ]:
def logistic_regression(X, Y, X_eval=None):
    """Fit a logistic-regression classifier and predict on held-out data.

    Args:
        X: training feature matrix.
        Y: training labels.
        X_eval: features to predict on; defaults to the notebook global
            `X_test` (backward compatible — the original read it implicitly).

    Returns:
        np.ndarray of predicted labels (the original computed `pred` and
        silently discarded it).
    """
    # print() form works on both Python 2 and 3 (original used a Py2-only
    # print statement).
    print("logistic Regression")
    from sklearn.linear_model import LogisticRegression
    logregr = LogisticRegression()
    logregr.fit(X, Y)
    if X_eval is None:
        X_eval = X_test
    pred = logregr.predict(X_eval)
    return pred

In [20]:
def add_features_to_df():
    """Filter the tweet frame to known topics and attach integer topic ids.

    Reads the notebook globals `df` and `all_topics`; uses getTopicId for the
    topic -> id mapping.

    Returns:
        A new DataFrame containing only rows whose 'topic' is in `all_topics`,
        with an added 'topic_id' column.
    """
    # .copy() makes the slice independent, avoiding SettingWithCopyWarning
    # when the new column is assigned below.
    df_filter = df[df["topic"].isin(all_topics)].copy()
    # NOTE(review): the original built np.array([tech, politics, music, sports])
    # here, but those names are locals of get_all_topics() — that line raised
    # NameError and its result was never used, so it has been removed.
    df_filter['topic_id'] = df_filter['topic'].apply(getTopicId)
    return df_filter

In [21]:
def add_one_hot_encoding():
    """Append each row's topic one-hot vector to its embedding, storing the
    result in a new 'vector' column of the notebook global `df_filter`.

    BUG FIX: the original iterated with iterrows() and assigned into `row`,
    but iterrows() yields *copies* — the concatenated vectors were silently
    discarded and 'vector' stayed equal to 'embedding'. Build the column in
    a single apply() pass instead, which actually writes back.
    """
    df_filter['vector'] = df_filter.apply(
        lambda row: np.concatenate([np.asarray(row['embedding']),
                                    np.asarray(one_hot[row['topic']])]),
        axis=1)

In [ ]:
# Load the preprocessed tweet dataset. "rb": pickles are binary — the original
# text-mode open("...", "r") only worked on Python 2 and leaked the handle.
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
with open("data_rnn", "rb") as fh:
    [X, y, df, d] = pickle.load(fh)
[all_topics, genre] = get_all_topics()
# Tokenize every tweet once up front; the embedding cells reuse this column.
df['tokenized_sents'] = df.apply(lambda row: nltk.word_tokenize(row['tweet']), axis=1)

In [ ]:
##to get word2Vec embedding 
## preprocessing 
# Train (and persist to ./word2vec) the Word2Vec model on the tokenized
# tweets, then embed every tweet as a flattened 30x100 = 3000-dim vector.
model =word2vec_preprocess(df)
df['embedding'] = df['tokenized_sents'].apply(getEmbedding)

In [22]:
# Alternative embedding: word2topic vectors instead of Word2Vec.
keys = word2topic_preprocess()
# NOTE(review): the traceback below shows this cell raised NameError — `df`
# was not defined in this kernel session; the data-loading cell (further down
# in the file) must run first. Cells are out of execution order; the notebook
# does not survive Restart & Run All as saved.
df['embedding'] = df['tokenized_sents'].apply(getEmbeddingWord2Topic)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-22-2cddfd535115> in <module>()
      1 keys = word2topic_preprocess()
----> 2 df['embedding'] = df['tokenized_sents'].apply(getEmbeddingWord2Topic)

NameError: name 'df' is not defined

In [ ]:
# Keep only tweets with a known topic and attach integer topic ids
# (reads the globals `df` and `all_topics`).
df_filter = add_features_to_df()
## Prepare train and test data

In [ ]:
import copy  # kept for compatibility: other cells may rely on this import
# Positional train/test split.
# BUG FIX: the original used train [0:5000] and test [5001:6357], silently
# dropping row 5000; the boundary index is now shared.
# np.vstack already returns a fresh array and Series.copy() is a deep copy,
# so the copy.deepcopy() wrappers were redundant and have been removed.
SPLIT = 5000
END = 6357
X = np.vstack(df_filter['embedding'][0:SPLIT])
X_test = np.vstack(df_filter['embedding'][SPLIT:END])
Y = df_filter['sentiment'][0:SPLIT].copy()
Y_test = df_filter['sentiment'][SPLIT:END].copy()